Data: The first look

The two data sets used in this analysis are the Madison case data sourced from the Wisconsin DHS and wastewater concentration data produced by the Wisconsin State Laboratory of Hygiene. This wastewater data has entries every couple of days from November 15 2020 to June 24 2021.

Date Site Cases SevenDayMACases N1 BCoV N2 PMMoV GeoMeanN12 FlowRate temperature equiv_sewage_amt
2020-09-11 Madison 153 192.2857 NA NA NA NA NA NA NA NA
2020-09-12 Madison 181 194.0000 NA NA NA NA NA NA NA NA
2020-09-13 Madison 171 158.7143 NA NA NA NA NA NA NA NA
2020-09-14 Madison 96 154.7143 NA NA NA NA NA NA NA NA
2020-09-15 Madison 69 153.5714 NA NA NA NA NA NA NA NA
2020-09-16 Madison 231 148.1429 NA NA NA NA NA NA NA NA

The case data has a strong weekend effect so for this section we look at a seven day smoothing of cases. The simple display of the data shows the core components of this story. First that wastewater data is very noisy. And that there is a hint of a relationship between the two signals.

FirstImpressionDF <- FullDF%>%
  NoNa("N1","Cases")

FirstImpression <- FirstImpressionDF%>%#Removing outliers
  ggplot(aes(x=Date))+#Data depends on time
  geom_line(aes(y=N1, color="N1",info=N1))+#compares N1 to Cases
  geom_line(aes(y=MinMaxFixing(PMMoV,N1), color="PMMoV",info=PMMoV))+
  geom_line(aes(y=MinMaxFixing(FlowRate,N1), color="FlowRate",info=FlowRate))+
  geom_line(aes(y=MinMaxFixing(BCoV,N1), color="BCoV",info=BCoV))+
  #geom_line(aes(y=MinMaxFixing(temperature,N1), color="temperature",info=temperature))+
  #geom_line(aes(y=MinMaxFixing(tss,N1), color="tss",info=tss))+
  geom_line(aes(y=MinMaxFixing(equiv_sewage_amt,N1), color="equiv_sewage_amt",info=equiv_sewage_amt))+
  geom_line(aes(y=MinMaxFixing(SevenDayMACases,N1), color="Seven Day MA Cases",info=Cases))+
  geom_line(aes(y=MinMaxFixing(N2,N1), color="N2",info=N2))+
  labs(y="N1")



ggplotly(FirstImpression)
#To remove weekend effects we are looking at the 7 day smoothing of cases.
DaySmoothed=21#Very wide smoothing to find where the data strong deviate from trend

ErrorMarkedDF <- FullDF%>%#Remove older Var data that clearly has no relationship to Cases
  mutate(EarlyOutlier = ifelse(Date < mdy("11/20/2020"),TRUE,FALSE))#replace old. 
#"11/20/2020"
#"1/1/2021"

ErrorMarkedDF$DetectedOutlierN1 <- IdentifyOutliers(ErrorMarkedDF$N1)
ErrorMarkedDF$DetectedOutlierN2 <- IdentifyOutliers(ErrorMarkedDF$N2)  
ErrorMarkedDF$DetectedOutlierGeo <- IdentifyOutliers(ErrorMarkedDF$GeoMeanN12)
ErrorMarkedDF$DetectedOutlierPMMOV <- IdentifyOutliers(ErrorMarkedDF$PMMoV)
ErrorMarkedDF$DetectedOutlierBCOv <- IdentifyOutliers(ErrorMarkedDF$BCoV)

ErrorRemovedDF <- ErrorMarkedDF%>%
  mutate(PotentialOutlier = Date>mdy("03/31/2021")&mdy("05/12/2021")>Date)%>%
         select(-EarlyOutlier)#Removing unneeded calculated columns


ErrorMarkedDF%>%
  select(-EarlyOutlier)%>%
  #filter(DetectedOutlierN1|DetectedOutlierN2|DetectedOutlierGeo)%>%
  mutate(NumberOfOutliers = DetectedOutlierN1+DetectedOutlierN2+DetectedOutlierGeo)%>%
  group_by(NumberOfOutliers)%>%
  summarize(n())
## # A tibble: 4 x 2
##   NumberOfOutliers `n()`
##              <int> <int>
## 1                0   469
## 2                1     7
## 3                2     5
## 4                3     7
ErrorMarkedDF%>%
  select(-EarlyOutlier)%>%
  summarize(N1 = sum(DetectedOutlierN1), N2 = sum(DetectedOutlierN2), Geo = sum(DetectedOutlierGeo), N12 =sum(DetectedOutlierN1*DetectedOutlierN2), N1Geo =sum(DetectedOutlierN1*DetectedOutlierGeo), N2Geo =sum(DetectedOutlierN2*DetectedOutlierGeo), AllVars =sum(DetectedOutlierN1*DetectedOutlierN2*DetectedOutlierGeo))
##   N1 N2 Geo N12 N1Geo N2Geo AllVars
## 1 10 16  12   7     9    10       7
OutlierComp <- ErrorMarkedDF%>%
  filter(DetectedOutlierN1|DetectedOutlierN2|DetectedOutlierGeo)%>%
  mutate(Name = "",Name = paste(Name,ifelse(DetectedOutlierN1,"N1","")),
         Name = paste(Name,ifelse(DetectedOutlierN2,"N2","")),
         Name = paste(Name,ifelse(DetectedOutlierGeo,"Geo","")))%>%
  ggplot(aes(x=Date))+
  geom_point(aes(y=N1,shape="N1",color=Name))+
  geom_point(aes(y=N2,shape="N2",color=Name))+
  geom_point(aes(y=GeoMeanN12,shape="Geo",color=Name))
ggplotly(OutlierComp)
#,info1=N1,info2=GeoMeanN12

OutlierComp <- ErrorMarkedDF%>%
  filter(DetectedOutlierN1|DetectedOutlierPMMOV|DetectedOutlierBCOv)%>%
  mutate(Name = "",Name = paste(Name,ifelse(DetectedOutlierN1,"N1","")),
         Name = paste(Name,ifelse(DetectedOutlierPMMOV,"PMMoV","")),
         Name = paste(Name,ifelse(DetectedOutlierBCOv,"BCoV","")))%>%
  ggplot(aes(x=Date))+
  geom_point(aes(y=N1,color=Name,info1=PMMoV,info2=BCoV))
ggplotly(OutlierComp)
#ErrorMarkedDF%>%
#  select(-EarlyOutlier)%>%
#  write.csv(file="OutlierFlagDF")
#wwtp_comments
#analytical_comments
#ErrorRemovedDF%>%
#  filter(!is.na(analytical_comments))
CoVarGraphic <- ErrorRemovedDF%>%
  ggplot()+#LargeError
  aes(y = N1,color=DetectedOutlier,info = Date)
#PotentialOutlier#DetectedOutlier
CoVarGraphicPMMoV <- CoVarGraphic+
  geom_point(aes(x = PMMoV))
#Outliers
#26203622#29931423
#149752.0#461781.5
#Potintial outlier
#26463265#30048370
#172041.0#98442.5
CoVarGraphicFlowRate <- CoVarGraphic+
  geom_point(aes(x = FlowRate))
CoVarGraphicBCoV <- CoVarGraphic+
  geom_point(aes(x = BCoV))
CoVarGraphicequiv_sewage_amt <- CoVarGraphic+#temperature,equiv_sewage_amt
  geom_point(aes(x = equiv_sewage_amt))
#PMMoV,FlowRate,BCoV,temperature,equiv_sewage_amt
ggplotly(CoVarGraphicPMMoV)
ggplotly(CoVarGraphicBCoV)
ggplotly(CoVarGraphicFlowRate)
ggplotly(CoVarGraphicequiv_sewage_amt)

ErrorRemovedDF%>%
  group_by(DetectedOutlier)%>%
  summarize(median(PMMoV,na.rm = TRUE),sd(PMMoV,na.rm = TRUE),
            median(N1,na.rm = TRUE),sd(N1,na.rm = TRUE))


AA <- ErrorRemovedDF%>%
  ggplot(aes(x=Date,y=equiv_sewage_amt))+
  geom_point()
ggplotly(AA)
summary(lm(DetectedOutlier ~ PMMoV ,data = ErrorMarkedDF))

library(rpart)
fit <- rpart(DetectedOutlier ~ PMMoV + N1,
   method="class", data=ErrorMarkedDF)

printcp(fit)
plotcp(fit)
summary(fit)
plot(fit)
text(fit, use.n=TRUE, all=TRUE, cex=.5)

CoVarGraphicPMMoV <- CoVarGraphic+
  geom_point(aes(x = PMMoV))+
  geom_hline(yintercept = 5.61e5)+
  geom_vline(xintercept = 2.82e7)
ggplotly(CoVarGraphicPMMoV)


ErrorMarkedDF%>%
  ggplot(aes(x = DetectedOutlier, y = log(PMMoV))) +
        geom_boxplot()


t.test(PMMoV ~ DetectedOutlier, data = ErrorMarkedDF)

#26203622#29931423
#149752.0#461781.5